Introduction

The function provides the correlation analysis of CMR genes with multiple gene features including Mean gene length, Mean exon number, Mean GC content and Mean distance to adjacent gene. To be specific, we split the maize genome in a sliding window of 100 adjacent genes with step size of 10, and calculate the frequency of CMR genes in each window (100 adjacent genes).

Row

Correlation of CMR genes with mean exon length

Statistics of linear regression of CMR genes with mean exon length

Stats values
r.squared 0.876679582823961
adj.r.squared 0.876663416015207
fstatistic 54227.1264638613 1 7628
sigma 4.98158040677552
df 2 7628 2

Anova analysis of CMR genes with mean exon length

group Residuals
Df 1.00 7628.00000
Sum Sq 1345708.14 189297.54147
Mean Sq 1345708.14 24.81614
F value 54227.13 NA
Pr(>F) 0.00 NA

Row

Exon length difference between CMR genes and non-CMR genes

The statistics of exon length difference between CMR genes and non-CMR genes

Stats CMR nonCMR
Min 204.0 90
First quantile 1617.0 645
Median 2567.5 1094
Third quantile 4535.0 1780
Max 8912.0 3482

Exon length density difference between CMR genes and non-CMR genes

---
title: Genomic Feature Analysis of CMR
output: 
  flexdashboard::flex_dashboard:
    orientation: rows
    social: menu
    source_code: embed
    theme: cosmo
---

```{r setup, include=FALSE}
library(flexdashboard)
library(rtracklayer)
library(plotly)
library(ggplot2)
library(seqinr)
library(GenomicFeatures)
library(knitr)
library(kableExtra)
options(scipen = 20)
knitr::opts_chunk$set(echo = TRUE,
                      warning = FALSE,
                      message = FALSE)
```


```{r echo = FALSE, warning = FALSE, message = FALSE}
library(flexdashboard)
# calculate exon, intron and gene length from txdb according to geneID
exonLength <- function(geneID, txdb){
  exonTxdb <- exonsBy(txdb, "gene")
  tmp <- exonTxdb[geneID]
  exonLen <- unlist(lapply(tmp,
                           function(x) sum(width(reduce(x)))))
  geneGR <- genes(txdb)
  geneLength <- width(geneGR)
  names(geneLength) <- geneGR$gene_id
  geneLength <- geneLength[geneID]
  resMat <- data.frame(geneID = geneID,
                       geneLength = geneLength,
                       exonLength = exonLen)
  resMat$intronLength <- resMat$geneLength - resMat$exonLength
  resMat
}
CMRGene <- read.table(file = CMRGeneDir, sep = "\t", header = F, quote = '',
                      stringsAsFactors = F)
CMRGene <- CMRGene$V1

GTF <- import(con = GTFDir)
GTF <- subset(GTF, GTF$gene_biotype == "protein_coding") # only protein coding gene are used for downstream analysis
txdb <- makeTxDbFromGFF(file = GTFDir) # from R package GenomicFeatures

seqNames <- na.omit(as.numeric(as.character(unique(seqnames(GTF)@values))))
GTFdf <- data.frame(GTF)

resFreq <- NULL
resExonLength <- NULL
for(i in 1:length(seqNames)){
  curGTF <- subset(GTFdf, GTFdf$seqnames == i)
  exonNumber <- by(data = curGTF, INDICES = curGTF[, "gene_id"],
                   function(x) length(na.omit(unique(x[,"exon_number"]))))
  curGeneGTF <- subset(curGTF, curGTF$type == "gene")
  curGeneGTF <- curGeneGTF[order(curGeneGTF$start), ]
  rownames(curGeneGTF) <- curGeneGTF$gene_id
  orderGene <- rownames(curGeneGTF)
  exonNumber <- exonNumber[orderGene]
  curLength <- length(exonNumber)
  curStart <- seq(1, curLength - seqLength + stepSize, stepSize)
  curEnd <- c(curStart[1:(length(curStart)-1)] + seqLength - 1, curLength)
  curMat <- cbind(curStart, curEnd)
  CMRFreq <- apply(curMat, 1,
                   function(x) length(intersect(names(exonNumber)[x[1]:x[2]],
                                                CMRGene))/seqLength)
  tmpFeature <- exonLength(geneID = orderGene, txdb = txdb)
  curExonLength <- apply(curMat, 1,
                         function(x) mean(tmpFeature[orderGene[x[1]:x[2]],"exonLength"]))
  resFreq <- c(resFreq, CMRFreq)
  resExonLength <- c(resExonLength, curExonLength)
}
```



### Introduction
The function provides the correlation analysis of CMR genes with multiple gene features including **Mean gene length**, **Mean exon number**, **Mean GC content** and **Mean distance to adjacent gene**. To be specific, we split the maize genome in a sliding window of 100 adjacent genes with step size of 10, and calculate the frequency of CMR genes in each window (100 adjacent genes).


Row {data-height=250}
---------------------------------------------------------------

### Correlation of CMR genes with mean exon length

```{r echo = FALSE, warning = FALSE, message = FALSE}
df <- data.frame(freq = resFreq*100, 
                 exonLength = resExonLength/1000)
p1 <- ggplot(df, aes(x = freq, y = exonLength)) + 
      geom_point(colour = "grey") + 
      geom_smooth(method = "lm", se = FALSE, colour = "red") + 
      labs(x = "Frequency of CMR genes (%)", y = "Mean exon length (1000 bp)")
ggplotly(p1)
```

### Statistics of linear regression of CMR genes with mean exon length

```{r echo = FALSE, warning = FALSE, message = FALSE}
group <- gl(n = 2, k = nrow(df), labels = c("freq", "exonLength"))
values <- c(df$freq, df$exonLength)
lmRes <- lm(values ~ group)
res.summary <- summary(lmRes)
res.table <- data.frame(Stats = c("r.squared",
                                  "adj.r.squared",
                                  "fstatistic",
                                  "sigma",
                                  "df"),
                        values = c(res.summary$r.squared,
                                   res.summary$adj.r.squared,
                                   paste(res.summary$fstatistic, collapse = " "),
                                   res.summary$sigma,
                                   paste(res.summary$df, collapse = " ")))
knitr::kable(x = res.table, align = 'c') %>%  
  kableExtra::kable_styling(position = 'center', full_width = FALSE, 
                            stripe_color = 'black', latex_options = "bordered")
```


### Anova analysis of CMR genes with mean exon length

```{r echo = FALSE, warning = FALSE, message = FALSE}
knitr::kable(x = t(anova(lmRes)), align = "c") %>%
  kableExtra::kable_styling(position = "center", full_width = FALSE,
                            stripe_color = "black", latex_options = "bordered")
```



Row {data-height=250}
---------------------------------------------------------------

```{r echo = FALSE, warning = FALSE, message = FALSE}
nonCMRGene <- setdiff(unique(GTFdf$gene_id), CMRGene)
nonCMRGene <- sample(nonCMRGene, size = ratio*length(CMRGene))

cmrFeature <- exonLength(geneID = CMRGene, txdb = txdb)
nonCMRFeature <- exonLength(geneID = nonCMRGene, txdb = txdb)
```

### Exon length difference between CMR genes and non-CMR genes
```{r echo = FALSE, warning=FALSE, message=FALSE}
cmrStat <- boxplot.stats(cmrFeature$exonLength)$stats
tmpCMR <- cmrFeature$exonLength[which(cmrFeature$exonLength <= cmrStat[5])]
noncmrStat <- boxplot.stats(nonCMRFeature$exonLength)$stats
tmpNonCMR <- nonCMRFeature$exonLength[which(nonCMRFeature$exonLength <= noncmrStat[5])]

df <- data.frame(type = factor(rep(c("CMRGene", "nonCMRGene"),
                                   c(length(tmpCMR), length(tmpNonCMR)))),
                 value = c(tmpCMR/1000, tmpNonCMR/1000))

p <- ggplot(df, aes(x=type, y=value, fill=type)) + 
  geom_violin(trim = FALSE) +
  theme(axis.title.x = element_blank(),
        #axis.text.x = element_blank(),
        axis.ticks.x = element_blank(),
        legend.title = element_blank(),
        legend.position = "none")
ggplotly(p)
```

### The statistics of exon length difference between CMR genes and non-CMR genes
```{r echo = FALSE, warning=FALSE, message=FALSE}
res <- data.frame(Stats = c("Min", "First quantile", "Median", "Third quantile", "Max"),
                  CMR = cmrStat, nonCMR = noncmrStat)
knitr::kable(x = res, align = 'c') %>%  
  kableExtra::kable_styling(position = 'center', full_width = FALSE, 
                            stripe_color = 'black', latex_options = "bordered")
```


### Exon length density difference between CMR genes and non-CMR genes
```{r echo = FALSE, warning = FALSE, message = FALSE}
p <- ggplot(df, aes(x=value, fill=type)) + 
  geom_density(alpha = 0.5) +
  theme(axis.title.x = element_blank(),
        #axis.text.x = element_blank(),
        axis.ticks.x = element_blank(),
        legend.title = element_blank(),
        legend.position = "none")
ggplotly(p)
```